from sklearn.preprocessing import (FunctionTransformer,
QuantileTransformer,
MinMaxScaler,
RobustScaler,
StandardScaler,
KBinsDiscretizer,
)
%matplotlib inline
from pathlib import Path
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image
import PIL
import cv2
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import KBinsDiscretizer, minmax_scale
import glob
from keras.applications import Xception
from keras.applications.xception import preprocess_input
def transform_image(arr, f, **kwargs):
    """
    Apply ``f`` element-wise over a 2-D array of images.

    Each image is flattened before being handed to ``f``; when ``f`` returns an
    ndarray whose total size matches the input image, the result is reshaped
    back to the image's original shape.  The collected results are transposed
    so the output layout matches the input layout.
    """
    arr = np.array(arr)
    assert arr.ndim == 2
    per_column = []
    for col in range(arr.shape[1]):
        col_results = []
        for image in arr[:, col]:
            source_shape = image.shape
            result = f(image.flatten(), **kwargs)
            # shape-preserving transformation: restore layout when sizes agree
            if isinstance(result, np.ndarray) and np.prod(result.shape) == np.prod(source_shape):
                result = result.reshape(source_shape)
            col_results.append(result)
        per_column.append(col_results)
    return np.array(per_column).T
class ImageTransformer(FunctionTransformer):
    """FunctionTransformer that maps ``function`` over a 2-D array of images
    via :func:`transform_image`."""

    def __init__(self, function):
        self.function = function

        def _apply(x):
            return transform_image(x, function)

        super().__init__(_apply)
def get_histogram_stat(img, stat = np.mean):
    """Return a scalar statistic (``stat``) of a 2-D grayscale image's pixels."""
    # only single-channel, two-dimensional images are supported
    assert img.ndim == 2
    return stat(img)
def to_black_and_white(img, dtype = np.int16, asarray = True, normalize_range = (0,255)):
    """
    Transform the image to a black-and-white (single-channel, 2-D) one.

    Parameters
    ----------
    img : PIL.Image.Image or np.ndarray
        Source image; PIL images are converted to grayscale ("L") first.
    dtype : numpy dtype
        Dtype the result is cast to.
    asarray : bool
        When True return an ndarray, otherwise a PIL image.
    normalize_range : tuple or None
        Min-max range to scale pixel values into; None skips scaling.
    """
    # Generalized from the original isinstance check against
    # PIL.BmpImagePlugin.BmpImageFile: that accepted only BMP files and relied
    # on the plugin submodule being implicitly imported (accessing
    # PIL.BmpImagePlugin can raise AttributeError otherwise).  Every PIL image
    # type subclasses Image.Image, so this is backward compatible.
    if isinstance(img, Image.Image):
        arr = np.array(img.convert('L'))
    else:
        arr = img
    if normalize_range is None:
        arr = arr.astype(dtype)
    else:
        # axis=None scales over the flattened array (global min/max)
        arr = minmax_scale(arr, normalize_range, axis = None).astype(dtype)
    if asarray:
        return arr
    else:
        return Image.fromarray(arr)
def get_histogram_stat_vector(images, stat = np.mean, normalization_function = to_black_and_white):
    """
    Vectorized statistic over an ImageLoader instance or any collection of
    images/arrays: each item is normalized then reduced with ``stat``.
    """
    def _compute(img):
        return stat(normalization_function(img))

    if isinstance(images, ImageLoader):
        return images.map(_compute)
    return [_compute(img) for img in images]
from PIL import Image, ImageOps
def padding(img, expected_size):
    """Symmetrically pad a PIL image up to a square canvas of side
    ``expected_size`` (a single integer, unlike resize_with_padding)."""
    extra_w = expected_size - img.size[0]
    extra_h = expected_size - img.size[1]
    left = extra_w // 2
    top = extra_h // 2
    # remaining pixels go to the right/bottom so odd deltas stay centered
    border = (left, top, extra_w - left, extra_h - top)
    return ImageOps.expand(img, border)
def resize_with_padding(img, expected_size):
    """Shrink ``img`` in place to fit inside ``expected_size`` (a (w, h)
    tuple), preserving aspect ratio, then pad it out to exactly that size."""
    # thumbnail() mutates the image and never enlarges it
    img.thumbnail((expected_size[0], expected_size[1]))
    extra_w = expected_size[0] - img.size[0]
    extra_h = expected_size[1] - img.size[1]
    left = extra_w // 2
    top = extra_h // 2
    # odd deltas put the extra pixel on the right/bottom edge
    border = (left, top, extra_w - left, extra_h - top)
    return ImageOps.expand(img, border)
def gray_to_rgb(img):
    """
    Replicate a single-channel image into a 3-channel (H, W, 3) float array.

    Accepts a 2-D array or a 3-D array with a trailing singleton channel axis
    (shape (H, W, 1)); the same grayscale values are copied into all three
    RGB channels.
    """
    img = np.asarray(img)
    if img.ndim == 3:
        # Drop the trailing channel axis.  The original line was
        # `img.reshape((*img.shape[:-1]))`, which is invalid syntax
        # (bare starred expression); this is the intended behavior.
        img = img.reshape(img.shape[:-1])
    img2 = np.zeros((img.shape[0], img.shape[1], 3))
    img2[:, :, 0] = img  # same value in each channel
    img2[:, :, 1] = img
    img2[:, :, 2] = img
    return img2
class ImageLoader():
    """
    loads images from path given a list of wildcards

    Walks ``root_dir``, treating each immediate subfolder's name as the class
    label, and pairs every "*_mask*" file with the image file that shares its
    stem.  Items are stored as (image_path, mask_path, label) tuples.
    """
    def __init__(self, root_dir, mask_extension = ".bmp", image_extension = ".bmp"):
        # NOTE(review): mask_extension / image_extension are currently unused.
        label_paths = Path(root_dir).iterdir()
        #set labels based on parent folder
        all_items = []
        for label_path in label_paths:
            # NOTE(review): "/"-based splitting assumes POSIX-style paths.
            label = str(label_path).split("/")[-1]
            mask_paths = [str(i) for i in set(label_path.glob("*_mask*"))]
            image_paths = []
            new_mask_paths = []
            for i in range(len(mask_paths)):
                mask_path = mask_paths[i]
                # candidate images: files whose name contains the mask's stem
                # (the text before "_mask"), excluding the masks themselves
                im_paths = list(Path(label_path).rglob("*" + mask_path.split("/")[-1].split("_mask")[0] +"*"))
                im_paths = [i for i in im_paths if not "_mask" in str(i)]
                if len(im_paths) > 0:
                    new_mask_paths.append(mask_path)
                    # keep only the first match for this mask
                    image_paths.append(str(im_paths[0]))
                else:
                    # a mask with no matching image is silently skipped
                    pass
            #
            labels = [label]*len(image_paths)
            items = list(zip(image_paths, new_mask_paths, labels))
            all_items += items
        self.items = all_items
        return
    def __len__(self):
        # number of (image, mask, label) items discovered
        return len(self.items)
    def __getitem__(self, idx):
        # lazily open the image and mask files for item idx
        return {"image":Image.open(self.items[idx][0]), "mask": Image.open(self.items[idx][1]), "label":self.items[idx][2]}
    def get_image_array(self, index, size = None, to_rgb = False, normalize = False, normalize_range = (0,255)):
        """Return the image at ``index`` as an ndarray, optionally min-max
        scaled into ``normalize_range``, converted to RGB, and resized with
        padding to ``size``."""
        image = Image.open(self.items[index][0])
        if normalize:
            image = np.array(image)
            shape = image.shape
            # scale the flattened pixels, then restore the original shape
            image = minmax_scale(image.ravel(), feature_range=normalize_range).reshape(shape)
            image = Image.fromarray(image)
        if to_rgb:
            image = image.convert("RGB")
        if not size is None:
            image = resize_with_padding(image, size)
        return np.array(image)
    def get_mask_array(self, index):
        """Return the binary mask at ``index`` as a boolean ndarray."""
        mask = Image.open(self.items[index][1])
        mask_array = np.array(mask)
        assert len(np.unique(mask_array.flatten())) <= 2, "mask has more than two values"
        # NOTE(review): divides by zero (-> NaN) when the mask is constant.
        mask_array = (mask_array - mask_array.min())/(mask_array.max()-mask_array.min())
        return mask_array.astype(bool)
    def get_label(self, index):
        # class label (parent folder name) of item index
        return self.items[index][2]
    def get_masked_image(self, index):
        # zero out every pixel outside the mask
        return np.where(self.get_mask_array(index), self.get_image_array(index), 0)
    def plot_masked_image(self, index, alpha = 0.5):
        # overlay the masked image on top of the full grayscale image
        plt.imshow(self.get_image_array(index), cmap = "gray")
        plt.imshow(self.get_masked_image(index), alpha = alpha, cmap = "gray")
        return
    def get_masked_flat_image(self, index, return_index = False):
        """Return the 1-D vector of pixels inside the mask; when
        ``return_index`` is True, also return their flat indices."""
        image_flat = self.get_image_array(index).flatten()
        mask_flat = self.get_mask_array(index).flatten()
        image_flat = image_flat[mask_flat]
        if not return_index:
            return image_flat
        else:
            return image_flat, mask_flat.nonzero()[0]
    def get_image_id(self, index):
        # filename portion before "_FLAIR"
        return self.items[index][0].split('/')[-1].split("_FLAIR")[0]
    def get_flair_id(self, index):
        # filename portion after "_FLAIR", with the extension stripped
        return self.items[index][0].split('/')[-1].split("_FLAIR")[-1].split('.')[0]
    def map(self, function, attribute = 'image'):
        """Apply ``function`` to one attribute of every item (or to the whole
        item dict when ``attribute`` is None) and return the results list."""
        assert attribute in (None, "image","label","mask")
        if not attribute is None:
            vals = [i[attribute] for i in self]
        else:
            vals = [i for i in self]
        results = list(map(function, vals))
        return results
# Build the training and test loaders from sibling data folders.
loader = ImageLoader("../Train")
loader_test = ImageLoader("../Test/")
#all(loader.map(lambda x: np.array(x).ndim == 2)), all(loader_test.map(lambda x: np.array(x).ndim == 2))
# Sanity check: render one padded RGB test image and inspect its shape.
plt.imshow(loader_test.get_image_array(0, size = (512,512), to_rgb = True))
loader_test.get_image_array(0, size = (512,512), to_rgb = True).shape
(512, 512, 3)
# Visual spot checks: a masked test image overlay and a raw training mask.
loader_test.plot_masked_image(100, alpha = 0.8)
loader.get_mask_array(212)
array([[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
...,
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False],
[False, False, False, ..., False, False, False]])
from keras.applications import EfficientNetV2S
from tensorflow.keras.preprocessing import image
import tensorflow as tf
# Input resolution and pixel range expected by the feature extractor below.
image_size = (299,299)
pixel_range = (0,255)
# ImageNet-pretrained EfficientNetV2-S used as a frozen feature extractor:
# no classification head, global max pooling, built-in input preprocessing.
feature_extractor = EfficientNetV2S(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=(*image_size,3),
    pooling="max",
    #classes=1000,
    #classifier_activation="softmax",
    include_preprocessing=True,
)
2022-07-06 00:51:49.137094: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/alan/.local/lib/python3.8/site-packages/cv2/../../lib64: 2022-07-06 00:51:49.137143: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303) 2022-07-06 00:51:49.137173: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (alan-Lenovo-ideapad-310-14ISK): /proc/driver/nvidia/version does not exist 2022-07-06 00:51:49.137480: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
# Collect per-image ids, flair ids, and labels for both splits.
ids_train = np.array([loader.get_image_id(i) for i in range(len(loader))])
ids_test = np.array([loader_test.get_image_id(i) for i in range(len(loader_test))])
ids_all = np.hstack([ids_train, ids_test])
flairs_train = np.array([loader.get_flair_id(i) for i in range(len(loader))])
flairs_test = np.array([loader_test.get_flair_id(i) for i in range(len(loader_test))])
flairs_all = np.hstack([flairs_train, flairs_test])
labels_train = np.array([i["label"] for i in loader])
labels_test = np.array([i["label"] for i in loader_test])
labels_all = np.hstack([labels_train, labels_test])
import pandas as pd
# Tabulate the metadata for the bias analyses below.
id_df = pd.DataFrame(ids_all, columns = ["id"])
id_df["flair"] = flairs_all.astype(int)
id_df["label"] = labels_all
# Load every image as a normalized RGB array at the extractor's input size.
images_train = [
    loader.get_image_array(
        i,
        image_size,
        to_rgb = True,
        normalize = True,
        normalize_range=pixel_range
    ) for i in range(len(loader))
]
images_test = [
    loader_test.get_image_array(
        i,
        image_size,
        to_rgb = True,
        normalize = True,
        normalize_range=pixel_range
    ) for i in range(len(loader_test))
]
images_train = np.array(images_train)
images_test = np.array(images_test)
# Extract deep features for the training split (CPU inference; slow).
features_train = feature_extractor.predict(images_train)
2022-07-06 00:52:05.347622: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 308701653 exceeds 10% of free system memory.
36/36 [==============================] - 177s 5s/step
# Extract deep features for the test split.
features_test = feature_extractor.predict(images_test)
29/29 [==============================] - 149s 5s/step
Checamos aqui se alguma classe possui uma probabilidade maior para flairs específicos. Isso pode fazer com que o modelo aprenda a discriminar a classe baseando-se no flair, o que pode ser ruim para seu poder de generalização.
Foi observado que para EM, os flairs são maiores enquanto os flairs para AVC e SLE possuem distribuição semelhante. Esse viés provavelmente ocorre no momento da coleta dos dados, em que o responsável pela imagem limita os flairs de acordo com o flair anterior
# Check for flair bias: per-label flair-id distributions (deprecated
# sns.distplot is used here; see the FutureWarning in the output).
id_df.query('label != "Test"').groupby("label").apply(lambda x: sns.distplot(x["flair"], label = x["label"].iloc[0]))
plt.legend()
plt.title("distribuição de flairs por label")
plt.savefig("./images/vies_de_flair.png", bbox_inches= "tight")
/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
import umap
# Project the deep features of all samples (train + test) into 2-D.
reducer = umap.UMAP()
features_all = np.vstack([features_train, features_test])
labels_all = np.hstack([labels_train, labels_test])
embs = reducer.fit_transform(features_all)
import plotly.express as px
# Scatter of the embedding colored by class label.
px.scatter(x = embs[:,0], y = embs[:,1], color = id_df["label"])
é possível ver que as features extraídas também são capazes de segmentar os flairs por similaridade
# Same embedding colored by flair id, with marker symbol per label.
fig = px.scatter(x = embs[:,0], y = embs[:,1], color = id_df["flair"].astype(int), symbol = id_df["label"],
                 width = 1000,
                 height = 600
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
é possível ver que a separação é ainda mais forte
from sklearn.linear_model import LogisticRegression
from sklearn.base import TransformerMixin, BaseEstimator
class LinearScaledEmbeddings(TransformerMixin, BaseEstimator):
    """
    Wraps a linear estimator: ``fit`` quantile-scales X before fitting the
    estimator; ``transform`` weights features by the fitted coefficients and
    embeds the weighted matrix in 2-D with UMAP.
    """
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y = None, **fit_params):
        # map each feature to a uniform distribution before fitting
        self.scaler = QuantileTransformer().fit(X)
        scaled = self.scaler.transform(X)
        self.estimator.fit(scaled, y, **fit_params)
        return self

    def transform(self, X, **umap_kwargs):
        # NOTE: X is deliberately not passed through self.scaler here
        #X = self.scaler.transform(X)
        weighted = self.estimator.coef_ * X
        return umap.UMAP(**umap_kwargs).fit_transform(weighted)
# L1-regularized logistic regression on the deep features (saga supports l1).
estimator = LogisticRegression(penalty = 'l1', solver = "saga")
scaler = LinearScaledEmbeddings(estimator).fit(features_train, labels_train)
/home/alan/.local/lib/python3.8/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
# Max predicted probability per sample, with small random jitter for plotting.
id_df["lasso_proba_max"] = scaler.estimator.predict_proba(features_all).max(1) - 0.05*(np.random.random(id_df.shape[0]))
# Check for flair bias in the classifier's confidence per label.
id_df.query('label != "Test"').groupby("label").apply(lambda x: sns.distplot(x["lasso_proba_max"], label = x["label"].iloc[0]))
plt.legend()
plt.title("distribuição de max_proba por label")
plt.ylim(1)
plt.savefig("./images/probas_lasso.png", bbox_inches= "tight")
/home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). /home/alan/.local/lib/python3.8/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
# Joint distribution of classifier confidence vs flair id, colored by label.
sns.jointplot(id_df["lasso_proba_max"], id_df["flair"].astype(int), alpha = 0.2, hue = id_df["label"])
/home/alan/.local/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
<seaborn.axisgrid.JointGrid at 0x7f879eb3e760>
from sklearn.metrics import auc, roc_auc_score, roc_curve, classification_report
# Training-set ROC AUC of the lasso classifier (in-sample, so optimistic).
roc_auc_score(labels_train, scaler.estimator.predict_proba(features_train)[:,1], labels = scaler.estimator.classes_)
0.9985583539355263
# Coefficient-weighted UMAP embedding of all samples (train + test).
lasso_embs = scaler.transform(features_all)
# Embedding colored by classifier confidence.
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["lasso_proba_max"], symbol = id_df["label"],
                 width = 1000,
                 height = 600
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
# Embedding colored by flair id.
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["flair"].astype(float), symbol = id_df["label"],
                 width = 1000,
                 height = 600
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
# Embedding colored by label.
fig = px.scatter(x = lasso_embs[:,0], y = lasso_embs[:,1], color = id_df["label"],
                 width = 1000,
                 height = 600
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
class ForestScaledEmbeddings(TransformerMixin, BaseEstimator):
    """
    Wraps a tree ensemble: ``fit`` trains the ensemble; ``transform`` maps each
    sample to its vector of leaf indices and embeds those vectors in 2-D with
    UMAP (hamming distance by default, since leaf ids are categorical).
    """
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y = None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X, metric = "hamming", **umap_kwargs):
        #X = self.scaler.transform(X)
        leaves = self.estimator.apply(X)
        return umap.UMAP(metric = metric, **umap_kwargs).fit_transform(leaves)
from sklearn.ensemble import RandomForestClassifier, RandomTreesEmbedding
# Forest with a minimum leaf size to limit overfitting of the leaf embedding.
estimator = RandomForestClassifier(min_samples_leaf=10)
forest_scaler = ForestScaledEmbeddings(estimator)
forest_scaler.fit(features_train, labels_train)
ForestScaledEmbeddings(estimator=RandomForestClassifier(min_samples_leaf=10))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ForestScaledEmbeddings(estimator=RandomForestClassifier(min_samples_leaf=10))
RandomForestClassifier(min_samples_leaf=10)
RandomForestClassifier(min_samples_leaf=10)
# Leaf-index UMAP embedding (hamming distance between leaf-id vectors).
forest_embs = forest_scaler.transform(features_all, metric = "hamming")
/home/alan/.local/lib/python3.8/site-packages/umap/umap_.py:1802: UserWarning: gradient function is not yet implemented for hamming distance metric; inverse_transform will be unavailable
# Max forest probability per sample, used as plot color below.
id_df["forest_proba_max"] = forest_scaler.estimator.predict_proba(features_all).max(1)
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["forest_proba_max"], symbol = id_df["label"],
                 width = 1000,
                 height = 600,
                 title = "Embeddings UMAP de nós terminais de RandomForest, cor representa probabilidade do estimador"
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
# Forest-leaf embedding colored by flair id.
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["flair"].astype(float), symbol = id_df["label"],
                 width = 1000,
                 height = 600,
                 # fixed displayed-title typo: "core repesenta" -> "cor representa"
                 title = "Embeddings UMAP de nós terminais de RandomForest, cor representa flair"
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
# Forest-leaf embedding colored by class label.
fig = px.scatter(x = forest_embs[:,0], y = forest_embs[:,1], color = id_df["label"],
                 width = 1000,
                 height = 600
                 )
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=-.2,
                                          ticks="outside"))
import plotly.express as px
# Facet view of the first 10 raw test images.
images = [loader_test.get_image_array(i) for i in range(10)]
fig = px.imshow(
    np.array(images),
    facet_col=0,
    binary_string=True,
    facet_col_wrap=4,
    height=500,width=500,
    facet_col_spacing=0,
    facet_row_spacing=0,
)
fig
def ceildiv(a, b):
    """Ceiling division of ``a`` by ``b`` using exact integer arithmetic."""
    quotient, remainder = divmod(a, b)
    # round away from floor whenever there is any remainder
    return quotient + bool(remainder)
import plotly.graph_objs as go
import plotly.offline as py
import pandas as pd
import numpy as np
from ipywidgets import interactive, HBox, VBox
py.init_notebook_mode()
# Demo dataset for the linked scatter / image-grid widget below.
df = pd.read_csv('https://raw.githubusercontent.com/jonmmease/plotly_ipywidget_notebooks/master/notebooks/data/cars/cars.csv')
f = go.FigureWidget([go.Scatter(y = df['City mpg'], x = df['City mpg'], mode = 'markers')])
scatter = f.data[0]
N = len(df)
# Jitter the points so overlapping identical values become visible.
scatter.x = scatter.x + np.random.rand(N)/10 *(df['City mpg'].max() - df['City mpg'].min())
scatter.y = scatter.y + np.random.rand(N)/10 *(df['City mpg'].max() - df['City mpg'].min())
scatter.marker.opacity = 0.5
# Create a table FigureWidget that updates on selection from points in the scatter plot of f
im_kwargs = dict(
    facet_col=0,
    binary_string=True,
    facet_col_wrap=5,
    height=800,
    width=1200,
    facet_col_spacing=0,
    facet_row_spacing=0,
)
t = px.imshow(
    images_train[:20],
    **im_kwargs,
)
t = go.FigureWidget(t)
import matplotlib.pyplot as plt
import seaborn as sns
def selection_fn(trace,points,selector):
    # Selection callback: redraw the image-grid widget `t` with 20 random
    # training images inside a batched update.
    # NOTE(review): the selected `points` are ignored and idxs is sampled at
    # random from the first 100 images — confirm this is the intended behavior.
    with t.batch_update():
        idxs = np.random.choice(range(100),size = 20, replace = False)
        #new = go.FigureWidget(imshow(images_train[idxs]))
        # grow the figure height with the number of rows (5 images per row)
        kws = {**im_kwargs, **{"height":300*ceildiv(len(idxs),5)}}
        new = go.FigureWidget(px.imshow(images_train[idxs], **kws))
        t.update({'data':new.data,'layout':new.layout,"frames":new.frames}, overwrite = True)
        #for i in range(len(t.data)):
        #    t.data[i] = data[i]
# Wire the callback so lasso/box selection on the scatter refreshes the grid.
scatter.on_selection(selection_fn)
# Put everything together
VBox((f,t))
VBox(children=(FigureWidget({
'data': [{'marker': {'opacity': 0.5},
'mode': 'markers',
…
# Quick sanity check of ceildiv.
ceildiv(11,3)
4